R Tutorial
An introduction to R
Introduction
This tutorial is will introduce the reader to , a free, open-source statistical computing environment often used with RStudio, a integrated development environment for .
R Project Logo
Download
- Download at https://www.r-project.org/
- Download
RStudioat https://rstudio.com/products/rstudio/download/
Calculator
can be used as a super awesome calculator
# 5 + 3 = 8
5 + 3 ## [1] 8
# 24 / (1 + 2) = 8
24 / (1 + 2) ## [1] 8
# 2 * 2 * 2 = 8
2^3 ## [1] 8
# 8 * 8 = 64
sqrt(64) ## [1] 8
# -log10(0.05 / 5000000) = 8
-log10(0.05 / 5000000) ## [1] 8
Functions
has many useful built in functions
1:10## [1] 1 2 3 4 5 6 7 8 9 10
as.character(1:10)## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
rep(1:2, times = 5)## [1] 1 2 1 2 1 2 1 2 1 2
rep(1:5, times = 2)## [1] 1 2 3 4 5 1 2 3 4 5
rep(1:5, each = 2)## [1] 1 1 2 2 3 3 4 4 5 5
rep(1:5, length.out = 7)## [1] 1 2 3 4 5 1 2
seq(5, 50, by = 5)## [1] 5 10 15 20 25 30 35 40 45 50
seq(5, 50, length.out = 5)## [1] 5.00 16.25 27.50 38.75 50.00
paste(1:10, 20:30, sep = "-")## [1] "1-20" "2-21" "3-22" "4-23" "5-24" "6-25" "7-26" "8-27" "9-28" "10-29" "1-30"
paste(1:10, collapse = "-")## [1] "1-2-3-4-5-6-7-8-9-10"
paste0("x", 1:10)## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "x8" "x9" "x10"
min(1:10)## [1] 1
max(1:10)## [1] 10
range(1:10)## [1] 1 10
mean(1:10)## [1] 5.5
sd(1:10)## [1] 3.02765
Custom Functions
Users can also create their own functions
customFunction1 <- function(x, y) {
z <- 100 * x / (x + y)
paste(z, "%")
}
customFunction1(x = 10, y = 90)## [1] "10 %"
customFunction2 <- function(x) {
mymin <- mean(x - sd(x))
mymax <- mean(x) + sd(x)
print(paste("Min =", mymin))
print(paste("Max =", mymax))
}
customFunction2(x = 1:10)## [1] "Min = 2.47234964590251"
## [1] "Max = 8.52765035409749"
for loops and if else
statements
xx <- NULL #creates and empty object
for(i in 1:10) {
xx[i] <- i*3
}
xx## [1] 3 6 9 12 15 18 21 24 27 30
xx %% 2 #gives the remainder when divided by 2## [1] 1 0 1 0 1 0 1 0 1 0
for(i in 1:length(xx)) {
if((xx[i] %% 2) == 0) {
print(paste(xx[i],"is Even"))
} else {
print(paste(xx[i],"is Odd"))
}
}## [1] "3 is Odd"
## [1] "6 is Even"
## [1] "9 is Odd"
## [1] "12 is Even"
## [1] "15 is Odd"
## [1] "18 is Even"
## [1] "21 is Odd"
## [1] "24 is Even"
## [1] "27 is Odd"
## [1] "30 is Even"
# or
ifelse(xx %% 2 == 0, "Even", "Odd")## [1] "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even" "Odd" "Even"
paste(xx, ifelse(xx %% 2 == 0, "is Even", "is Odd"))## [1] "3 is Odd" "6 is Even" "9 is Odd" "12 is Even" "15 is Odd" "18 is Even" "21 is Odd" "24 is Even" "27 is Odd" "30 is Even"
Objects
Information can be stored in user defined objects, in multiple forms:
c(): a string of valuesmatrix(): a two dimensional matrix in one formatdata.frame(): a two dimensional matrix where each column can be a different formatlist():
A string…
xc <- 1:10
xc## [1] 1 2 3 4 5 6 7 8 9 10
xc <- c(1,2,3,4,5,6,7,8,9,10)
xc## [1] 1 2 3 4 5 6 7 8 9 10
A matrix…
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = T)
xm## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 11 12 13 14 15 16 17 18 19 20
## [3,] 21 22 23 24 25 26 27 28 29 30
## [4,] 31 32 33 34 35 36 37 38 39 40
## [5,] 41 42 43 44 45 46 47 48 49 50
## [6,] 51 52 53 54 55 56 57 58 59 60
## [7,] 61 62 63 64 65 66 67 68 69 70
## [8,] 71 72 73 74 75 76 77 78 79 80
## [9,] 81 82 83 84 85 86 87 88 89 90
## [10,] 91 92 93 94 95 96 97 98 99 100
xm <- matrix(1:100, nrow = 10, ncol = 10, byrow = F)
xm## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
A data frame…
xd <- data.frame(
x1 = c("aa","bb","cc","dd","ee",
"ff","gg","hh","ii","jj"),
x2 = 1:10,
x3 = c(1,1,1,1,1,2,2,2,3,3),
x4 = rep(c(1,2), times = 5),
x5 = rep(1:5, times = 2),
x6 = rep(1:5, each = 2),
x7 = seq(5, 50, by = 5),
x8 = log10(1:10),
x9 = (1:10)^3,
x10 = c(T,T,T,F,F,T,T,F,F,F)
)
xd## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
A list…
xl <- list(xc, xm, xd)
xl[[1]]## [1] 1 2 3 4 5 6 7 8 9 10
xl[[2]]## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 11 21 31 41 51 61 71 81 91
## [2,] 2 12 22 32 42 52 62 72 82 92
## [3,] 3 13 23 33 43 53 63 73 83 93
## [4,] 4 14 24 34 44 54 64 74 84 94
## [5,] 5 15 25 35 45 55 65 75 85 95
## [6,] 6 16 26 36 46 56 66 76 86 96
## [7,] 7 17 27 37 47 57 67 77 87 97
## [8,] 8 18 28 38 48 58 68 78 88 98
## [9,] 9 19 29 39 49 59 69 79 89 99
## [10,] 10 20 30 40 50 60 70 80 90 100
xl[[3]]## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 1 aa 1 1 1 1 1 5 0.0000000 1 TRUE
## 2 bb 2 1 2 2 1 10 0.3010300 8 TRUE
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
## 4 dd 4 1 2 4 2 20 0.6020600 64 FALSE
## 5 ee 5 1 1 5 3 25 0.6989700 125 FALSE
## 6 ff 6 2 2 1 3 30 0.7781513 216 TRUE
## 7 gg 7 2 1 2 4 35 0.8450980 343 TRUE
## 8 hh 8 2 2 3 4 40 0.9030900 512 FALSE
## 9 ii 9 3 1 4 5 45 0.9542425 729 FALSE
## 10 jj 10 3 2 5 5 50 1.0000000 1000 FALSE
Selecting Data
xc[5] # 5th element in xc## [1] 5
xd$x3[5] # 5th element in col "x3"## [1] 1
xd[5,"x3"] # row 5, col "x3"## [1] 1
xd$x3 # all of col "x3"## [1] 1 1 1 1 1 2 2 2 3 3
xd[,"x3"] # all rows, col "x3"## [1] 1 1 1 1 1 2 2 2 3 3
xd[3,] # row 3, all cols## x1 x2 x3 x4 x5 x6 x7 x8 x9 x10
## 3 cc 3 1 1 3 2 15 0.4771213 27 TRUE
xd[c(2,4),c("x4","x5")] # rows 2 & 4, cols "x4" & "x5"## x4 x5
## 2 2 2
## 4 2 4
xl[[3]]$x1 # 3rd object in the list, col "x1## [1] "aa" "bb" "cc" "dd" "ee" "ff" "gg" "hh" "ii" "jj"
regexpr
xx <- data.frame(Name = c("Item 1 (detail 1)",
"Item 20 (detail 20)",
"Item 300 (detail 300)"),
Item = NA,
Detail = NA)
xx$Detail <- substr(xx$Name, regexpr("\\(", xx$Name)+1, regexpr("\\)", xx$Name)-1)
xx$Item <- substr(xx$Name, 1, regexpr("\\(", xx$Name)-2)
xx## Name Item Detail
## 1 Item 1 (detail 1) Item 1 detail 1
## 2 Item 20 (detail 20) Item 20 detail 20
## 3 Item 300 (detail 300) Item 300 detail 300
Data Formats
Data can also be saved in many formats:
- numeric
- integer
- character
- factor
- logical
xd$x3 <- as.character(xd$x3)
xd$x3## [1] "1" "1" "1" "1" "1" "2" "2" "2" "3" "3"
xd$x3 <- as.numeric(xd$x3)
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
xd$x3 <- as.factor(xd$x3)
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 1 2 3
xd$x3 <- factor(xd$x3, levels = c("3","2","1"))
xd$x3## [1] 1 1 1 1 1 2 2 2 3 3
## Levels: 3 2 1
xd$x10## [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
as.numeric(xd$x10) # TRUE = 1, FALSE = 0## [1] 1 1 1 0 0 1 1 0 0 0
sum(xd$x10)## [1] 5
Internal structure of an object can be checked with
str()
str(xc) # c()## num [1:10] 1 2 3 4 5 6 7 8 9 10
str(xm) # matrix()## int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
str(xd) # data.frame()## 'data.frame': 10 obs. of 10 variables:
## $ x1 : chr "aa" "bb" "cc" "dd" ...
## $ x2 : int 1 2 3 4 5 6 7 8 9 10
## $ x3 : Factor w/ 3 levels "3","2","1": 3 3 3 3 3 2 2 2 1 1
## $ x4 : num 1 2 1 2 1 2 1 2 1 2
## $ x5 : int 1 2 3 4 5 1 2 3 4 5
## $ x6 : int 1 1 2 2 3 3 4 4 5 5
## $ x7 : num 5 10 15 20 25 30 35 40 45 50
## $ x8 : num 0 0.301 0.477 0.602 0.699 ...
## $ x9 : num 1 8 27 64 125 216 343 512 729 1000
## $ x10: logi TRUE TRUE TRUE FALSE FALSE TRUE ...
str(xl) # list()## List of 3
## $ : num [1:10] 1 2 3 4 5 6 7 8 9 10
## $ : int [1:10, 1:10] 1 2 3 4 5 6 7 8 9 10 ...
## $ :'data.frame': 10 obs. of 10 variables:
## ..$ x1 : chr [1:10] "aa" "bb" "cc" "dd" ...
## ..$ x2 : int [1:10] 1 2 3 4 5 6 7 8 9 10
## ..$ x3 : num [1:10] 1 1 1 1 1 2 2 2 3 3
## ..$ x4 : num [1:10] 1 2 1 2 1 2 1 2 1 2
## ..$ x5 : int [1:10] 1 2 3 4 5 1 2 3 4 5
## ..$ x6 : int [1:10] 1 1 2 2 3 3 4 4 5 5
## ..$ x7 : num [1:10] 5 10 15 20 25 30 35 40 45 50
## ..$ x8 : num [1:10] 0 0.301 0.477 0.602 0.699 ...
## ..$ x9 : num [1:10] 1 8 27 64 125 216 343 512 729 1000
## ..$ x10: logi [1:10] TRUE TRUE TRUE FALSE FALSE TRUE ...
Packages
Additional libraries can be installed and loaded for use.
install.packages("scales")library(scales)
xx <- data.frame(Values = 1:10)
xx$Rescaled <- rescale(x = xx$Values, to = c(1,30))
xx## Values Rescaled
## 1 1 1.000000
## 2 2 4.222222
## 3 3 7.444444
## 4 4 10.666667
## 5 5 13.888889
## 6 6 17.111111
## 7 7 20.333333
## 8 8 23.555556
## 9 9 26.777778
## 10 10 30.000000
libraries can also be used without having to load them
scales::rescale(1:10, to = c(1,30))## [1] 1.000000 4.222222 7.444444 10.666667 13.888889 17.111111 20.333333 23.555556 26.777778 30.000000
Data Wrangling
R for Data Science - https://r4ds.had.co.nz/
xx <- data.frame(Group = c("X","X","Y","Y","Y","X","X","X","Y","Y"),
Data1 = 1:10,
Data2 = seq(10, 100, by = 10))
xx$NewData1 <- xx$Data1 + xx$Data2
xx$NewData2 <- xx$Data1 * 1000
xx## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 X 6 60 66 6000
## 7 X 7 70 77 7000
## 8 X 8 80 88 8000
## 9 Y 9 90 99 9000
## 10 Y 10 100 110 10000
xx$Data1 < 5 # which are less than 5## [1] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
xx[xx$Data1 < 5,]## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx[xx$Group == "X", c("Group","Data2","NewData1")]## Group Data2 NewData1
## 1 X 10 11
## 2 X 20 22
## 6 X 60 66
## 7 X 70 77
## 8 X 80 88
Data wrangling with tidyverse and pipes
(%>%)
library(tidyverse) # install.packages("tidyverse")
xx <- data.frame(Group = c("X","X","Y","Y","Y","Y","Y","X","X","X")) %>%
mutate(Data1 = 1:10,
Data2 = seq(10, 100, by = 10),
NewData1 = Data1 + Data2,
NewData2 = Data1 * 1000)
xx## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
## 5 Y 5 50 55 5000
## 6 Y 6 60 66 6000
## 7 Y 7 70 77 7000
## 8 X 8 80 88 8000
## 9 X 9 90 99 9000
## 10 X 10 100 110 10000
filter(xx, Data1 < 5)## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx %>% filter(Data1 < 5)## Group Data1 Data2 NewData1 NewData2
## 1 X 1 10 11 1000
## 2 X 2 20 22 2000
## 3 Y 3 30 33 3000
## 4 Y 4 40 44 4000
xx %>% filter(Group == "X") %>%
select(Group, NewColName=Data2, NewData1)## Group NewColName NewData1
## 1 X 10 11
## 2 X 20 22
## 3 X 80 88
## 4 X 90 99
## 5 X 100 110
xs <- xx %>%
group_by(Group) %>%
summarise(Data2_mean = mean(Data2),
Data2_sd = sd(Data2),
NewData2_mean = mean(NewData2),
NewData2_sd = sd(NewData2))
xs## # A tibble: 2 × 5
## Group Data2_mean Data2_sd NewData2_mean NewData2_sd
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 X 60 41.8 6000 4183.
## 2 Y 50 15.8 5000 1581.
xx %>% left_join(xs, by = "Group")## Group Data1 Data2 NewData1 NewData2 Data2_mean Data2_sd NewData2_mean NewData2_sd
## 1 X 1 10 11 1000 60 41.83300 6000 4183.300
## 2 X 2 20 22 2000 60 41.83300 6000 4183.300
## 3 Y 3 30 33 3000 50 15.81139 5000 1581.139
## 4 Y 4 40 44 4000 50 15.81139 5000 1581.139
## 5 Y 5 50 55 5000 50 15.81139 5000 1581.139
## 6 Y 6 60 66 6000 50 15.81139 5000 1581.139
## 7 Y 7 70 77 7000 50 15.81139 5000 1581.139
## 8 X 8 80 88 8000 60 41.83300 6000 4183.300
## 9 X 9 90 99 9000 60 41.83300 6000 4183.300
## 10 X 10 100 110 10000 60 41.83300 6000 4183.300
Read/Write data
xx <- read.csv("data_r_tutorial.csv")
write.csv(xx, "data_r_tutorial.csv", row.names = F)For excel sheets, the package readxl can be used to read
in sheets of data.
library(readxl) # install.packages("readxl")
xx <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data")Tidy Data
- Tutorial 1 - https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html
- Tutorial 2 - https://r4ds.had.co.nz/tidy-data.html
yy <- xx %>%
group_by(Name, Location) %>%
summarise(Mean_DTF = round(mean(DTF),1)) %>%
arrange(Location)
yy## # A tibble: 9 × 3
## # Groups: Name [3]
## Name Location Mean_DTF
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
yy <- yy %>% spread(key = Location, value = Mean_DTF)
yy## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Jessore, Bangladesh` `Metaponto, Italy` `Saskatoon, Canada`
## <chr> <dbl> <dbl> <dbl>
## 1 CDC Maxim AGL 86.7 134. 52.5
## 2 ILL 618 AGL 79.3 138. 47
## 3 Laird AGL 76.8 137. 56.8
yy <- yy %>% gather(key = TraitName, value = Value, 2:4)
yy## # A tibble: 9 × 3
## # Groups: Name [3]
## Name TraitName Value
## <chr> <chr> <dbl>
## 1 CDC Maxim AGL Jessore, Bangladesh 86.7
## 2 ILL 618 AGL Jessore, Bangladesh 79.3
## 3 Laird AGL Jessore, Bangladesh 76.8
## 4 CDC Maxim AGL Metaponto, Italy 134.
## 5 ILL 618 AGL Metaponto, Italy 138.
## 6 Laird AGL Metaponto, Italy 137.
## 7 CDC Maxim AGL Saskatoon, Canada 52.5
## 8 ILL 618 AGL Saskatoon, Canada 47
## 9 Laird AGL Saskatoon, Canada 56.8
yy <- yy %>% spread(key = Name, value = Value)
yy## # A tibble: 3 × 4
## TraitName `CDC Maxim AGL` `ILL 618 AGL` `Laird AGL`
## <chr> <dbl> <dbl> <dbl>
## 1 Jessore, Bangladesh 86.7 79.3 76.8
## 2 Metaponto, Italy 134. 138. 137.
## 3 Saskatoon, Canada 52.5 47 56.8
Base Plotting
We will start with some basic plotting using the base function
plot()
# A basic scatter plot
plot(x = xd$x8, y = xd$x9)# Adjust color and shape of the points
plot(x = xd$x8, y = xd$x9, col = "darkred", pch = 0)plot(x = xd$x8, y = xd$x9, col = xd$x4, pch = xd$x4)# Adjust plot type
plot(x = xd$x8, y = xd$x9, type = "line")# Adjust linetype
plot(x = xd$x8, y = xd$x9, type = "line", lty = 2)# Plot lines and points
plot(x = xd$x8, y = xd$x9, type = "both")Now lets create some random and normally distributed data to make some more complicated plots
# 100 random uniformly distributed numbers ranging from 0 - 100
ru <- runif(100, min = 0, max = 100)
ru## [1] 30.9121781 32.1829267 42.2647010 16.1754229 23.5625172 44.3682141 78.3270444 25.7793888 35.2511881 14.7144228 60.7532697 24.1566737
## [13] 19.3458974 21.8807855 40.6646676 73.3036560 64.6003054 55.7392779 5.1260773 66.8000103 52.5462892 54.8444951 5.6287848 40.5857535
## [25] 18.2331620 28.7362442 74.4939378 85.1302364 40.7311611 84.3526014 24.4060324 99.3290434 91.3838138 2.5456418 93.2412341 27.9620192
## [37] 70.7720552 0.3032020 83.2623216 45.9158561 96.2404347 12.6587489 21.6559721 2.1091698 3.2899364 93.3489808 47.9674401 85.6927624
## [49] 56.9991541 7.8762359 88.6662240 86.7447810 72.3717036 94.8826073 59.5257210 36.5135615 60.3472919 76.4650013 7.9059204 23.8739987
## [61] 80.3618527 47.4768994 26.6809901 85.2657579 30.2989461 67.4636522 4.4168441 62.6265713 92.9572082 40.2092617 91.5908662 79.9284620
## [73] 61.2257808 18.1624205 6.9392750 75.3634843 92.1931106 15.6638308 45.0579940 56.0695942 25.4014636 78.2999106 65.5475422 36.5985935
## [85] 97.9526651 20.7850832 38.7123019 25.1284228 23.7965369 49.0322230 20.3433656 92.3234679 95.3255589 81.1288768 58.1396914 0.9308145
## [97] 25.6982458 96.9178776 61.8373739 55.4002804
plot(x = ru)order(ru)## [1] 38 96 44 34 45 67 19 23 75 50 59 42 10 78 4 74 25 13 91 86 43 14 5 89 60 12 31 88 81 97 8 63 36 26 65
## [36] 1 2 9 56 84 87 70 24 15 29 3 6 79 40 62 47 90 21 22 100 18 80 49 95 55 57 11 73 99 68 17 83 20 66 37
## [71] 53 16 27 76 58 82 7 72 61 94 39 30 28 64 48 52 51 33 71 77 92 69 35 46 54 93 41 98 85 32
ru<- ru[order(ru)]
ru## [1] 0.3032020 0.9308145 2.1091698 2.5456418 3.2899364 4.4168441 5.1260773 5.6287848 6.9392750 7.8762359 7.9059204 12.6587489
## [13] 14.7144228 15.6638308 16.1754229 18.1624205 18.2331620 19.3458974 20.3433656 20.7850832 21.6559721 21.8807855 23.5625172 23.7965369
## [25] 23.8739987 24.1566737 24.4060324 25.1284228 25.4014636 25.6982458 25.7793888 26.6809901 27.9620192 28.7362442 30.2989461 30.9121781
## [37] 32.1829267 35.2511881 36.5135615 36.5985935 38.7123019 40.2092617 40.5857535 40.6646676 40.7311611 42.2647010 44.3682141 45.0579940
## [49] 45.9158561 47.4768994 47.9674401 49.0322230 52.5462892 54.8444951 55.4002804 55.7392779 56.0695942 56.9991541 58.1396914 59.5257210
## [61] 60.3472919 60.7532697 61.2257808 61.8373739 62.6265713 64.6003054 65.5475422 66.8000103 67.4636522 70.7720552 72.3717036 73.3036560
## [73] 74.4939378 75.3634843 76.4650013 78.2999106 78.3270444 79.9284620 80.3618527 81.1288768 83.2623216 84.3526014 85.1302364 85.2657579
## [85] 85.6927624 86.7447810 88.6662240 91.3838138 91.5908662 92.1931106 92.3234679 92.9572082 93.2412341 93.3489808 94.8826073 95.3255589
## [97] 96.2404347 96.9178776 97.9526651 99.3290434
plot(x = ru)# 100 normally distributed numbers with a mean of 50 and sd of 10
nd <- rnorm(100, mean = 50, sd = 10)
nd## [1] 65.36887 46.07190 47.55528 58.03504 55.74271 68.70367 47.42043 34.51012 53.66058 56.87554 32.38311 59.83192 53.48288 50.65946 61.20076
## [16] 57.71043 51.32132 55.63966 63.21350 53.95949 48.69658 56.81119 60.64028 75.92070 56.78021 28.63463 47.13149 46.78780 51.44414 41.54465
## [31] 33.11895 36.94242 49.99023 41.34040 46.70975 55.09973 49.22609 84.21313 42.53301 56.85120 47.30672 55.08106 41.72929 46.08051 44.67917
## [46] 61.58831 69.09002 71.54366 40.76796 53.41917 63.12178 56.13592 51.08316 67.01438 58.97346 39.90550 59.36576 48.28227 42.93408 49.61655
## [61] 58.33197 62.65112 37.55822 49.71131 51.45015 58.88540 32.60638 51.77198 61.17037 54.19280 58.03662 27.62846 34.21848 55.14100 48.71374
## [76] 77.12900 41.65744 40.66324 44.66201 43.82592 38.10102 47.43570 61.89061 51.04434 46.90748 47.73211 44.41603 56.42132 29.53994 33.47079
## [91] 41.82246 21.90596 51.29507 63.48882 48.49959 47.19184 44.93486 51.13405 53.65798 47.36845
nd <- nd[order(nd)]
nd## [1] 21.90596 27.62846 28.63463 29.53994 32.38311 32.60638 33.11895 33.47079 34.21848 34.51012 36.94242 37.55822 38.10102 39.90550 40.66324
## [16] 40.76796 41.34040 41.54465 41.65744 41.72929 41.82246 42.53301 42.93408 43.82592 44.41603 44.66201 44.67917 44.93486 46.07190 46.08051
## [31] 46.70975 46.78780 46.90748 47.13149 47.19184 47.30672 47.36845 47.42043 47.43570 47.55528 47.73211 48.28227 48.49959 48.69658 48.71374
## [46] 49.22609 49.61655 49.71131 49.99023 50.65946 51.04434 51.08316 51.13405 51.29507 51.32132 51.44414 51.45015 51.77198 53.41917 53.48288
## [61] 53.65798 53.66058 53.95949 54.19280 55.08106 55.09973 55.14100 55.63966 55.74271 56.13592 56.42132 56.78021 56.81119 56.85120 56.87554
## [76] 57.71043 58.03504 58.03662 58.33197 58.88540 58.97346 59.36576 59.83192 60.64028 61.17037 61.20076 61.58831 61.89061 62.65112 63.12178
## [91] 63.21350 63.48882 65.36887 67.01438 68.70367 69.09002 71.54366 75.92070 77.12900 84.21313
plot(x = nd)hist(x = nd)hist(nd, breaks = 20, col = "darkgreen")plot(x = density(nd))boxplot(x = nd)boxplot(x = nd, horizontal = T)ggplot2
Lets be honest, the base plots are ugly! The ggplot2
package gives the user to create a better, more visually appealing
plots. Additional packages such as ggbeeswarm and
ggrepel also contain useful functions to add to the
functionality of ggplot2.
- ggplot2 - https://ggplot2.tidyverse.org/
- Tutorial 1 - http://r-statistics.co/ggplot2-Tutorial-With-R.html
- Tutorial 2 - https://www.statsandr.com/blog/graphics-in-r-with-ggplot2/
- The R Graph Gallery - https://www.r-graph-gallery.com/ggplot2-package.html
library(ggplot2)
mp <- ggplot(xd, aes(x = x8, y = x9))
mp + geom_point()mp + geom_point(aes(color = x3, shape = x3), size = 4)mp + geom_line(size = 2)mp + geom_line(aes(color = x3), size = 2)mp + geom_smooth(method = "loess")mp + geom_smooth(method = "lm")xx <- data.frame(data = c(rnorm(50, mean = 40, sd = 10),
rnorm(50, mean = 60, sd = 5)),
group = factor(rep(1:2, each = 50)),
label = c("Label1", rep(NA, 49), "Label2", rep(NA, 49)))
mp <- ggplot(xx, aes(x = data, fill = group))
mp + geom_histogram(color = "black")mp + geom_histogram(color = "black", position = "dodge")mp1 <- mp + geom_histogram(color = "black") + facet_grid(group~.)
mp1mp + geom_density(alpha = 0.5)mp <- ggplot(xx, aes(x = group, y = data, fill = group))
mp + geom_boxplot(color = "black")mp + geom_boxplot() + geom_point()mp + geom_violin() + geom_boxplot(width = 0.1, fill = "white")library(ggbeeswarm)
mp + geom_quasirandom()mp + geom_quasirandom(aes(shape = group))mp2 <- mp + geom_violin() +
geom_boxplot(width = 0.1, fill = "white") +
geom_beeswarm(alpha = 0.5)
library(ggrepel)
mp2 + geom_text_repel(aes(label = label), nudge_x = 0.4)library(ggpubr)
ggarrange(mp1, mp2, ncol = 2, widths = c(2,1),
common.legend = T, legend = "bottom")Statistics
- Handbook of Biological Statistics - http://biostathandbook.com/
- R Companion for ^ - https://rcompanion.org/rcompanion/a_02.html
# Prep data
lev_Loc <- c("Saskatoon, Canada", "Jessore, Bangladesh", "Metaponto, Italy")
lev_Name <- c("ILL 618 AGL", "CDC Maxim AGL", "Laird AGL")
dd <- read_xlsx("data_r_tutorial.xlsx", sheet = "Data") %>%
mutate(Location = factor(Location, levels = lev_Loc),
Name = factor(Name, levels = lev_Name))
xx <- dd %>%
group_by(Name, Location) %>%
summarise(Mean_DTF = mean(DTF))
xx %>% spread(Location, Mean_DTF)## # A tibble: 3 × 4
## # Groups: Name [3]
## Name `Saskatoon, Canada` `Jessore, Bangladesh` `Metaponto, Italy`
## <fct> <dbl> <dbl> <dbl>
## 1 ILL 618 AGL 47 79.3 138.
## 2 CDC Maxim AGL 52.5 86.7 134.
## 3 Laird AGL 56.8 76.8 137.
# Plot
mp1 <- ggplot(dd, aes(x = Location, y = DTF, color = Name, shape = Name)) +
geom_point(size = 2, alpha = 0.7, position = position_dodge(width=0.5))
mp2 <- ggplot(xx, aes(x = Location, y = Mean_DTF,
color = Name, group = Name, shape = Name)) +
geom_point(size = 2.5, alpha = 0.7) +
geom_line(size = 1, alpha = 0.7) +
theme(legend.position = "top")
ggarrange(mp1, mp2, ncol = 2, common.legend = T, legend = "top")From first glace, it is clear there are differences between genotypes, locations, and genotype x environment (GxE) interactions. Now let’s do a few statistical tests.
summary(aov(DTF ~ Name * Location, data = dd))## Df Sum Sq Mean Sq F value Pr(>F)
## Name 2 88 44 3.476 0.0395 *
## Location 2 65863 32931 2598.336 < 2e-16 ***
## Name:Location 4 560 140 11.044 2.52e-06 ***
## Residuals 45 570 13
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As expected, an ANOVA shows statistical significance for genotype (p-value = 0.0395), Location (p-value < 2e-16) and GxE interactions (p-value < 2.52e-06). However, all this tells us is that one genotype is different from the rest, one location is different from the others and that there is GxE interactions. If we want to be more specific, would need to do some multiple comparison tests.
If we only have two things to compare, we could do a t-test.
xx <- dd %>%
filter(Location %in% c("Saskatoon, Canada", "Jessore, Bangladesh")) %>%
spread(Location, DTF)
t.test(x = xx$`Saskatoon, Canada`, y = xx$`Jessore, Bangladesh`)##
## Welch Two Sample t-test
##
## data: xx$`Saskatoon, Canada` and xx$`Jessore, Bangladesh`
## t = -17.521, df = 32.701, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -32.18265 -25.48402
## sample estimates:
## mean of x mean of y
## 52.11111 80.94444
DTF in Saskatoon, Canada is significantly different (p-value < 2.2e-16) from DTF in Jessore, Bangladesh.
xx <- dd %>%
filter(Name %in% c("ILL 618 AGL", "Laird AGL"),
Location == "Metaponto, Italy") %>%
spread(Name, DTF)
t.test(x = xx$`ILL 618 AGL`, y = xx$`Laird AGL`)##
## Welch Two Sample t-test
##
## data: xx$`ILL 618 AGL` and xx$`Laird AGL`
## t = 0.38008, df = 8.0564, p-value = 0.7137
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -5.059739 7.059739
## sample estimates:
## mean of x mean of y
## 137.8333 136.8333
DTF between ILL 618 AGL and Laird AGL are not significantly different (p-value = 0.7137) in Metaponto, Italy.
pch Plot
xx <- data.frame(x = rep(1:6, times = 5, length.out = 26),
y = rep(5:1, each = 6, length.out = 26),
pch = 0:25)
mp <- ggplot(xx, aes(x = x, y = y, shape = as.factor(pch))) +
geom_point(color = "darkred", fill = "darkblue", size = 5) +
geom_text(aes(label = pch), nudge_x = -0.25) +
scale_shape_manual(values = xx$pch) +
scale_x_continuous(breaks = 6:1) +
scale_y_continuous(breaks = 6:1) +
theme_void() +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
axis.text = element_blank(),
axis.ticks = element_blank()) +
labs(title = "Plot symbols in R (pch)",
subtitle = "color = \"darkred\", fill = \"darkblue\"",
x = NULL, y = NULL)
ggsave("pch.png", mp, width = 4.5, height = 3, bg = "white")R Markdown
Tutorials on how to create an R markdown document like this one can be found here: